********************************************************************************
* matching_firm.do
* Purpose: Match M&A firms with comparable non-M&A control firms using
*          propensity score matching (PSM), then construct the matched
*          firm event-study panel.
*
* Matching design:
*   - Treatment: firm undergoes an M&A event in year y+1
*   - Matching year window: 2004-2016 (firms need pre-event data from 2001+)
*   - Eligibility: firms with >= 10 average employees; non-missing revenue, age, wage
*   - Other M&A firms (treated in different years) are dropped from the control pool
*   - Matching variables: revenue (quadratic), age (quadratic), avg wage (quadratic)
*   - Cells: 2-digit NAICS x province x binned matching variables
*     (firms can only be matched to peers within the same cell)
*   - PSM: psmatch2, caliper 1, 1:1 nearest-neighbor without replacement
*
* The matched firm panel spans 2001-2017 and includes t = -6 to +5 relative to
* the event year (year_prior + 1). ds_6 (t = -1) is set to zero as the base period.
*
* Input : $data/firm_YYYY.dta (from clean_firm.do)
* Outputs:
*   $data/firm_matched_list.dta  - Matched pair IDs (one row per matched firm)
*   $data/firm_matched.dta       - Full event-study panel of matched firms
********************************************************************************

//------------------------------------------------------------------------------
// STEP 1: PROPENSITY SCORE MATCHING WITHIN EACH YEAR (firm_eligible_YYYY.dta)
//------------------------------------------------------------------------------
global start_year 	= 2004
global end_year 	= 2016

forvalues y = $start_year/$end_year {

	use $data/firm_`y', replace

	//------------------------------------------------------------------
	// Eligibility restrictions
	//------------------------------------------------------------------
	* Minimum size: at least 10 average employees in year y
	keep if PD7_AvgEmp_NonZero >= 10

	* Non-missing matching variables
	drop if mi(naics) | mi(total_revenue) | mi(age) | mi(avg_wage)

	//------------------------------------------------------------------
	// Treatment definition
	// treated == 1 for firms whose first M&A event happens in year y+1
	// (i.e., matched in the year before the deal)
	//------------------------------------------------------------------
	gen treated = (DEAL_YEAR == `y' + 1)

	* Remove other M&A firms from the control pool
	* (firms treated in any other year cannot serve as clean controls)
	drop if MnA_firm == 1 & treated == 0
	gsort -treated

	//------------------------------------------------------------------
	// Create covariate bins for exact matching within cells
	// avg_wage: 10 quantile bins; revenue and age: 15 quantile bins
	//------------------------------------------------------------------
	foreach var in avg_wage total_revenue age {
		if `var' == avg_wage {
			gquantiles 	bin_`var' = `var', xtile n(10)
		}
		else {
			gquantiles 	bin_`var' = `var', xtile n(15)
		}
	}

	* Cell = unique combination of sector x province x all covariate bins.
	* Matching is constrained within cells (exact matching on these dimensions).
	gegen cell = group(naics2 OPAddressProvince bin_*)
	drop if cell == .   // drop firms with missing cell assignment

	//------------------------------------------------------------------
	// Estimate propensity score using a linear probability model
	// with quadratic terms for each matching variable.
	//------------------------------------------------------------------
	reg treated c.total_revenue##c.total_revenue c.age##c.age c.avg_wage##c.avg_wage

	* Record R2 statistics to summarize PSM model fit across years
    local r2alist 	"`r2alist' `e(r2_a)'"
	local r2list	"`r2list' `e(r2)'"
	dis `r2alist'
	dis `r2list'

	predict pscore   // fitted propensity score

	* Save the original (unadjusted) pscore for diagnostics
	gen pscore_og = pscore

	* Add a large cell-specific offset to enforce within-cell matching:
	* firms in different cells will never be matched (their pscore gap > caliper)
	replace pscore = pscore + 10.0*cell

	//------------------------------------------------------------------
	// Sort firms by ID for replication (psmatch2 is sort-order sensitive).
	// The random outcome variable ensures tie-breaking is reproducible.
	//------------------------------------------------------------------
	gen outcome = rnormal()   // placeholder outcome (not used for analysis)
	gsort entid_syn

	* 1:1 nearest-neighbor PSM without replacement, caliper of 1 pscore unit
	psmatch2 treated, outcome(outcome) pscore(pscore) caliper(1) n(1) noreplacement

	* Construct a pair ID linking each treated firm to its matched control
	* _id = observation number (for controls), _n1 = matched pair's obs number
	gen		 id = _id if treated==0 & _weight==1
	replace	 id = _n1 if treated==1 & _weight==1
	gegen pairid = group(id)
	gsort pairid

	gen year_prior = `y'

	save $data/firm_eligible_`y'.dta, replace
}


//------------------------------------------------------------------------------
// STEP 2: SUMMARIZE PSM MODEL FIT ACROSS YEARS
//
// Converts the accumulated R2 lists from the loop into a dataset
// and reports the average R2 across all matching years.
//------------------------------------------------------------------------------
clear
set obs `: word count `r2alist''

gen adjR2 = .
local i = 1
foreach r2 of local r2alist {
    replace adjR2 = `r2' in `i'
    local ++i
}

gen R2 = .
local i = 1
foreach r2 of local r2list {
    replace R2 = `r2' in `i'
    local ++i
}

summ R2
scalar avg_R2 = r(mean)
summ adjR2
scalar avg_adjR2 = r(mean)

display "Average R2 across matching years = " avg_R2
display "Average adjusted R2 across matching years = " avg_adjR2


//------------------------------------------------------------------------------
// STEP 3: COMPILE MATCHED FIRM LIST (firm_matched_list.dta)
//
// Extracts the matched pairs from each year's eligible file and stacks them.
//------------------------------------------------------------------------------
forvalues y = $start_year/$end_year {

	use $data/firm_eligible_`y', replace

	* Keep only successfully matched firms (id is missing for unmatched units)
	drop if mi(id)
	keep entid_syn treated pairid year_prior pscore_og

	save $data/firm_matched_list_`y'.dta, replace
}

drop _all
forvalues y = $start_year/$end_year {
	append using $data/firm_matched_list_`y', keep(entid_syn pairid treated year_prior pscore_og) force
}
save $data/firm_matched_list, replace


//------------------------------------------------------------------------------
// STEP 4: BUILD THE FULL MATCHED FIRM PANEL (2001-2017)
//
// For each year, merge firm accounts (firm_YYYY) with the matched list.
// Only matched treated and control firms are retained (keep(3)).
// Stack all years into a single intermediate panel file.
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	use $data/firm_`y', clear

	* Keep only matched treated and control firms
	merge 1:m entid_syn using $data/firm_matched_list, keep(3) nogen

	save $data/firm_matched_`y', replace
}

drop _all
forvalues y = 2001/2017 {
	append using $data/firm_matched_`y', force
	rm $data/firm_matched_`y'.dta
}

save $data/firm_matched_intermid, replace


//------------------------------------------------------------------------------
// STEP 5: CONSTRUCT THE EVENT-STUDY PANEL (firm_matched.dta)
//
// Adds all variables needed for the event-study regressions:
//   - Unique firm and pair IDs
//   - Event time variable t (years relative to event) and ds_k dummies
//   - Sector (NAICS2) and age polynomials for regression controls
//   - Pair-level variables: acquirer flag, deal type, sector
//   - Drop financial sector deals (NAICS2 == 52 or 55)
//------------------------------------------------------------------------------
gsort pairid year_prior -treated year

* Numeric IDs for regression clustering and panel setup
gegen 	firm_id     = group(entid_syn)
gegen 	id 			= group(firm_id year_prior)    // unique firm x matching-year ID

* Set deal characteristics to missing in non-event years
* (ensures DEAL, Acquirer, DEAL_YEAR are only populated in the matching year)
replace DEAL 	 = . if year ~= year_prior
replace Acquirer = . if year ~= year_prior
replace DEAL_YEAR= . if year ~= year_prior

//------------------------------------------------------------------
// Event time variable and indicator dummies
// t = year - (year_prior + 1): t = 0 is the first post-event year
// t is capped at [-6, +5] to keep the panel balanced at the tails
// ds_1..ds_13 are indicator dummies for each event-time bin
// ds_6 (t = -1, year before the event) is set to zero: base period
//------------------------------------------------------------------
gen 	t = year - (year_prior + 1)
replace t = 6 	if t >  5 	& ~mi(t)
replace t = -6 	if t < -5 	& ~mi(t)
tab t, gen(ds_)
levelsof t, local(ts)
local end = r(r)
forvalue i = 1(1)`end' {
	local temp = `i' - 7
	label variable ds_`i' "`temp'"
}
replace ds_6 = 0    // t = -1 is the omitted base period

//------------------------------------------------------------------
// Sector variable: fill in missing NAICS values with firm modal sector
//------------------------------------------------------------------
drop naics2
gen naics_og = naics

* Use the most common NAICS code for the firm to fill gaps
gegen	naics_mode	= mode(naics), by(entid_syn) minmode
replace naics		= naics_mode	if mi(naics)
gen 	naics2 		= int(naics/100)
replace naics2  	= 31 if naics2 == 32 | naics2 == 33
replace naics2  	= 44 if naics2 == 45
replace naics2  	= 48 if naics2 == 49
replace naics2 		= 54 if naics2 == 56 | naics2 == 61 | naics2 == 62

* Record the sector as of the year before the event (used as a control)
gen 	naics2_event	= naics2 if treated == 1 & year == year_prior

//------------------------------------------------------------------
// Firm age: fill in missing age values using incorporation date at event year
// Age polynomials (age^2, age^3, age^4) used as regression controls
//------------------------------------------------------------------
drop 	dateinc dateinc_year age
gen 	temp_dateinc 	= dofc(BirthDate) if year == year_prior
gegen	dateinc 		= firstnm(temp_dateinc), by(id)
gen 	dateinc_year 	= yofd(dateinc)
gen 	age 			= year - dateinc_year
gen 	age2 			= age^2
gen 	age3 			= age^3
gen 	age4 			= age^4

* Acquirer and deal type: only meaningful at event year
replace Acquirer 	= . if ~(treated == 1 & year == year_prior)
replace merger 		= . if ~(treated == 1 & year == year_prior)

//------------------------------------------------------------------
// Pair-level variables: carried from the treated firm to both firms in the pair.
// These are used to define subsamples in analyze_main.do.
//   matched_acq       = 1 if the treated firm in the pair is an acquirer
//   matched_deal_type = 1 if the deal is a merger (0 = acquisition)
//   matched_sector    = 2-digit NAICS of the treated firm at event year
//------------------------------------------------------------------
gsort pairid year_prior -treated year
gegen matched_acq 		= firstnm(Acquirer), 			by(pairid year_prior)
gegen matched_deal_type = firstnm(merger),				by(pairid year_prior)
gegen matched_sector 	= firstnm(naics2_event),		by(pairid year_prior)

* Drop deals in financial (NAICS2 == 52) and management holding (NAICS2 == 55) sectors
drop if matched_sector == 52 | matched_sector == 55

compress
save $data/firm_matched, replace
